In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import lifelines

Import Data


In [2]:
# Location of the default_credit_card.csv file (hosted on GitHub, fetched over HTTPS).
DATA_URL = 'https://raw.githubusercontent.com/fclesio/learning-space/master/Datasets/02%20-%20Classification/default_credit_card.csv'

# Load the whole CSV into a DataFrame; every later cell derives from `df`.
df = pd.read_csv(DATA_URL)

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)


Out[3]:
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 ... BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 DEFAULT
0 1 20000 2 2 1 24 2 2 -1 -1 ... 0 0 0 0 689 0 0 0 0 1
1 2 120000 2 2 2 26 -1 2 0 0 ... 3272 3455 3261 0 1000 1000 1000 0 2000 1
2 3 90000 2 2 2 34 0 0 0 0 ... 14331 14948 15549 1518 1500 1000 1000 1000 5000 0
3 4 50000 2 2 1 37 0 0 0 0 ... 28314 28959 29547 2000 2019 1200 1100 1069 1000 0
4 5 50000 1 2 1 57 -1 0 -1 0 ... 20940 19146 19131 2000 36681 10000 9000 689 679 0

5 rows × 25 columns


In [53]:
# Feature matrix: columns at positions 0..23, i.e. everything before the
# DEFAULT column (position 24). A list indexer is kept so `iloc` returns the
# same kind of selection as the original hand-written index list.
feature_positions = list(range(24))
loans_X = df.iloc[:, feature_positions]

In [54]:
# Outcome column (position 24). Indexing with a one-element list keeps the
# result a single-column DataFrame rather than collapsing it to a Series.
target_positions = [24]
loans_Y = df.iloc[:, target_positions]

In [55]:
# Preview the first five rows of the feature matrix.
loans_X.head(n=5)


Out[55]:
ID LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 ... BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6
0 1 20000 2 2 1 24 2 2 -1 -1 ... 689 0 0 0 0 689 0 0 0 0
1 2 120000 2 2 2 26 -1 2 0 0 ... 2682 3272 3455 3261 0 1000 1000 1000 0 2000
2 3 90000 2 2 2 34 0 0 0 0 ... 13559 14331 14948 15549 1518 1500 1000 1000 1000 5000
3 4 50000 2 2 1 37 0 0 0 0 ... 49291 28314 28959 29547 2000 2019 1200 1100 1069 1000
4 5 50000 1 2 1 57 -1 0 -1 0 ... 35835 20940 19146 19131 2000 36681 10000 9000 689 679

5 rows × 24 columns


In [56]:
# Preview the first five rows of the outcome column.
loans_Y.head(n=5)


Out[56]:
DEFAULT
0 1
1 1
2 0
3 0
4 0

In [57]:
###
### Generate Training and Testing Set
###
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The fallback
# keeps the cell working on a pre-0.18 installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# X_train: independent (feature) variables for the train data set
# Y_train: dependent (outcome) variable for the train data set
#
# X_test: independent (feature) variables for the test data set
# Y_test: dependent (outcome) variable for the test data set
#
# test_size=0.2 holds out 20% of the rows for the test set; random_state=0
# fixes the shuffle so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    loans_X, loans_Y, test_size=0.2, random_state=0)

In [ ]: